Linear Regression

Authors

Ben, Andrew, Pranati, Filip

1 Setup

library(tidyverse)
library(plotly)

2 Data

# Read the two source tables, keeping only column 1 plus columns 152 through
# 220 of each file.
# NOTE(review): presumably column 1 is `country` and 152:220 are the years
# 1950-2018 (the later join key and axis breaks suggest so) -- confirm against
# the CSV headers.
CO2 <- read_csv(
  file = "co2_emissions_tonnes_per_person.csv",
  col_select = c(1, 152:220)
)
mhi <- read_csv(
  file = "mhhinc.csv",
  col_select = c(1, 152:220)
)


# Convert every column of `mhi` except the first to plain numbers.
# Values may carry a "k" suffix meaning thousands (e.g. "10.5k" -> 10500).
# The suffix is stripped *before* the numeric conversion, so suffixed values
# no longer raise "NAs introduced by coercion" warnings (the previous ifelse()
# evaluated as.numeric() on the raw, still-suffixed strings).
mhi <- mhi %>%
  mutate(across(-1, function(x) {
    # Columns that read_csv() already parsed as numbers contain no suffix.
    if (is.numeric(x)) {
      return(as.numeric(x))
    }
    x <- as.character(x)
    # 1000 for suffixed values, 1 otherwise; a missing value yields a missing
    # multiplier, so NA stays NA.
    multiplier <- if_else(str_detect(x, "k"), 1000, 1)
    as.numeric(str_remove_all(x, "k")) * multiplier
  }))

# Reshape both tables from wide to long: the former column names go into
# `year` and the cell values into `mean_income` / `co2`.
# Every column except the first is pivoted, so the reshape no longer hard-codes
# how many columns the read step selected.
mhi <- mhi %>%
  pivot_longer(
    cols = -1,
    names_to = "year",
    values_to = "mean_income",
    # Guarantee the income values end up as doubles.
    values_transform = list(mean_income = as.double)
  )
CO2 <- CO2 %>%
  pivot_longer(cols = -1, names_to = "year", values_to = "co2")

# Combine income and emissions into one table keyed by country and year.
# A full join keeps country-year rows that appear in only one of the two
# tables; the variable from the other table is then NA.
mhi_CO2 <- full_join(x = mhi, y = CO2, by = join_by(country, year))

3 Data Visualization

# Static plot: one line per country, CO2 against mean household income.
# The y-axis label is left blank; the interactive version below shows that
# text as a subtitle instead.
co2_income_plot <- ggplot(
  data = mhi_CO2,
  mapping = aes(x = mean_income, y = co2, color = country)
) +
  geom_line(linejoin = "round", show.legend = FALSE) +
  labs(
    title = "Carbon Dioxide Produced vs Mean Household Income",
    x = "Mean Household Income [International $]",
    y = ""
  )

# Interactive version: convert to plotly and replace the title with a
# two-line HTML title (main title, then the y-axis description in <sup>).
interactive_co2_income_plot <- layout(
  ggplotly(co2_income_plot),
  title = list(
    text = paste0(
      "Carbon Dioxide Produced vs Mean Household Income",
      "<br>",
      "<sup>",
      "Carbon Dioxide Produced per Person [Metric Tonnes]",
      "</sup>"
    )
  )
)
interactive_co2_income_plot
# Static plot: for each year, one dodged bar per country showing CO2 divided
# by mean household income.
time_plot <- ggplot(
  data = mutate(mhi_CO2, co2_per_income = co2 / mean_income),
  mapping = aes(
    x = year,
    y = co2_per_income,
    color = country,
    fill = country
  )
) +
  geom_col(position = "dodge", show.legend = FALSE) +
  # `year` is discrete here, so only these labels are shown on the axis.
  scale_x_discrete(
    breaks = c("1950", "1960", "1970", "1980", "1990", "2000", "2010", "2018")
  ) +
  labs(
    title = "Metric Tons of Carbon Dioxide Produced per Person by Mean Household Income per Year",
    x = "Year",
    y = ""
  )

# Interactive version: convert to plotly and replace the title with a
# two-line HTML title (main title, then the y-axis description in <sup>).
interactive_time_plot <- layout(
  ggplotly(time_plot),
  title = list(
    text = paste0(
      "Carbon Dioxide Produced per Person by Mean Household Income per Year",
      "<br>",
      "<sup>",
      "Carbon Dioxide Produced per Person by Mean Household Income [Metric Tonnes / International $]",
      "</sup>"
    )
  )
)

interactive_time_plot

4 Linear Regression

# Fit a simple linear regression of `co2` on `mean_income`, pooling every
# row of `mhi_CO2` into a single model.
# Rows with a missing value in either variable are dropped by lm()'s default
# NA handling (reported as "observations deleted due to missingness").
model <- lm(co2 ~ mean_income, data = mhi_CO2)
# Print the call, residual quantiles, coefficient table and fit statistics.
summary(model)

Call:
lm(formula = co2 ~ mean_income, data = mhi_CO2)

Residuals:
    Min      1Q  Median      3Q     Max 
-37.694  -1.833  -1.397   0.259  96.540 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.163e+00  6.448e-02   18.04   <2e-16 ***
mean_income 6.679e-04  8.515e-06   78.44   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.721 on 12264 degrees of freedom
  (1120 observations deleted due to missingness)
Multiple R-squared:  0.3341,    Adjusted R-squared:  0.334 
F-statistic:  6152 on 1 and 12264 DF,  p-value: < 2.2e-16

5 Model Fit